# Python libraries
# stdlib
import itertools
import time  # ADDED: used by timer() below but was never imported
import warnings
from contextlib import contextmanager
from datetime import datetime

# scientific stack
import numpy as np
import pandas as pd
import scipy.stats as ss
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# modelling
import lightgbm as lgbm
from sklearn.cluster import KMeans  # ADDED: KMeans is used throughout but was never imported
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline  # ADDED: make_pipeline is used below but was never imported
from sklearn.preprocessing import StandardScaler, LabelEncoder

# interactive plotting (plotly)
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label printed alongside the elapsed wall-clock time.

    Example
    -------
    with timer("fit"):
        model.fit(X, y)
    """
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))
#silence all library warnings so notebook output stays readable
warnings.filterwarnings('ignore') #ignore warning messages
#model csv -- presumably a pre-cleaned, model-ready churn dataset; confirm its provenance
df = pd.read_csv('modelready.csv')
#preprocessing and getting ready to model
#train_test_split first - Churn is the target
#feature matrix: raw telco columns plus two engineered ratio features
X = df[['gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
'PaymentMethod', 'MonthlyCharges', 'TotalCharges',
'totalcharges_tenure_ratio', 'monthlyperc_of_total']]
y = df['Churn']
#first - onehotencode the string categorical values
X = pd.get_dummies(X)
#stratify=y keeps the churn ratio identical across train and test splits
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,stratify=y,random_state=30)
print(f'Train size:{X_train.shape}')
print(f'Test size:{X_test.shape}')
#I will start by creating a KMeans model to see if there is any value in clustering the customers
#K Means clustering - Choosing the correct value for cluster size
#Fit KMeans for k = 1..14 and record the inertia (within-cluster sum of squares)
#for each k; the "elbow" of the resulting curve suggests a good cluster count.
kinert = []
for k in range(1, 15):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    kinert.append(kmeans.inertia_)
# the best value is the elbow of the inertia curve (judged visually below)
plt.figure(figsize=(15,5))
plt.plot(range(1,15),kinert)
plt.xlabel("number of k (cluster) value")
plt.ylabel("Inertia")
plt.show()
From the inertia of the models with k = 1 to 14 clusters, we can reasonably assert that there are at least 2 clusters; however, it would be worth looking at 3 clusters as well, because there still seems to be value there.
# Choose k = 2 based on previous visual as well as looking at the inertia and how it seems to reach an elbow around 3
#or 3 k's (will make 2 models )
#instantiate models -- fixed random_state so the cluster labels are reproducible between runs
kmeans2 = KMeans(n_clusters=2, random_state=30)
kmeans3 = KMeans(n_clusters=3, random_state=30)
#fit both models on the full one-hot-encoded feature matrix
kmeans2.fit(X)
kmeans3.fit(X)
#predict clusters
pred_2 = kmeans2.predict(X)
pred_3 = kmeans3.predict(X)
#side-by-side comparison: cluster colouring (left column) vs actual churn colouring (right column)
plt.figure(figsize=(16,16))
#tenure v monthlycharges - points coloured by cluster predictions of model
plt.subplot(4,2,1)
sns.scatterplot(x=X['tenure'],y=X['MonthlyCharges'],hue=pred_2,alpha=0.5)
plt.title('tenure v Monthly Charges colored by 2 clusters (predictions)')
#tenure v monthlycharges - points coloured by churn
plt.subplot(4,2,2)
sns.scatterplot(x=X['tenure'],y=X['MonthlyCharges'],hue=df['Churn'],alpha=0.5)
plt.title('tenure v Monthly Charges colored by actual Churn')
#tenure v totalcharges - cluster
plt.subplot(4,2,3)
sns.scatterplot(x=X['tenure'],y=X['TotalCharges'],hue=pred_2,alpha=0.5)
plt.title('tenure v Total Charges colored by 2 clusters (predictions)')
#tenure v totalcharges - churn
plt.subplot(4,2,4)
sns.scatterplot(x=X['tenure'],y=X['TotalCharges'],hue=df['Churn'],alpha=0.5)
plt.title('tenure v Total Charges colored by actual Churn')
#tenure v monthly - cluster
plt.subplot(4,2,5)
sns.scatterplot(x=X['tenure'],y=X['MonthlyCharges'],hue=pred_3,alpha=0.5)
plt.title('tenure v Monthly Charges colored by 3 clusters (predictions)')
#tenure v monthly - churn
plt.subplot(4,2,6)
sns.scatterplot(x=X['tenure'],y=X['MonthlyCharges'],hue=df['Churn'],alpha=0.5)
plt.title('tenure v Monthly Charges colored by actual Churn')
plt.subplot(4,2,7)
sns.scatterplot(x=X['tenure'],y=X['TotalCharges'],hue=pred_3,alpha=0.5)
plt.title('tenure v Total Charges colored by 3 clusters (predictions)')
plt.subplot(4,2,8)
sns.scatterplot(x=X['tenure'],y=X['TotalCharges'],hue=df['Churn'],alpha=0.5)
plt.title('tenure v Total Charges colored by actual Churn')
#BUG FIX: tight_layout must run BEFORE savefig, otherwise the saved PNG keeps the untidy layout
plt.tight_layout()
plt.savefig('saved_figs/clusters.png')
plt.show()
At first glance, it does not seem like the clusters add anything at all. However, I will provide more context on the findings by looking at the data itself to see how it has grouped the customers.
Another note worth mentioning: this is an entirely unsupervised model, so there is no scoring to be done — we have no ground-truth cluster labels to evaluate against.
#Create a pipeline for modeling
#scaler, reducer and 2 models (2,3 clusters)
#each pipeline standardises the features, projects with PCA, then clusters
scaler = StandardScaler()
reducer = PCA()
model = KMeans(n_clusters = 2)
model2 = KMeans(n_clusters=3)
pipe = make_pipeline(scaler,reducer,model)
pipe2 = make_pipeline(scaler,reducer,model2)
#fit the pipeline on the training split only (avoids leaking test rows into the clustering)
pipe.fit(X_train)
pipe2.fit(X_train)
#cluster assignments for the training rows (2-cluster and 3-cluster models)
pipd2 = pipe.predict(X_train)
pipd3 = pipe2.predict(X_train)
#replot previous subplots - ONLY 2 CLUSTERS (scaled + PCA pipeline predictions this time)
plt.figure(figsize=(12,12))
#tenure v monthlycharges - points coloured by cluster predictions of model
plt.subplot(2,2,1)
sns.scatterplot(x=X_train['tenure'],y=X_train['MonthlyCharges'],hue=pipd2,alpha=0.5)
plt.title('tenure v Monthly Charges colored by 2 clusters (predictions)')
#tenure v monthlycharges - points coloured by churn
plt.subplot(2,2,2)
sns.scatterplot(x=X_train['tenure'],y=X_train['MonthlyCharges'],hue=y_train,alpha=0.5)
plt.title('tenure v Monthly Charges colored by actual Churn')
#tenure v totalcharges - cluster
plt.subplot(2,2,3)
sns.scatterplot(x=X_train['tenure'],y=X_train['TotalCharges'],hue=pipd2,alpha=0.5)
plt.title('tenure v Total Charges colored by 2 clusters (predictions)')
#tenure v totalcharges - churn
plt.subplot(2,2,4)
sns.scatterplot(x=X_train['tenure'],y=X_train['TotalCharges'],hue=y_train,alpha=0.5)
plt.title('tenure v Total Charges colored by actual Churn')
#BUG FIX: apply tight_layout before saving so the saved figure matches what is shown
plt.tight_layout()
plt.savefig('saved_figs/clusters-2.png')
plt.show()
#replot previous subplots - ONLY 3 CLUSTERS (scaled + PCA pipeline predictions)
plt.figure(figsize=(12,12))
#tenure v monthlycharges - points coloured by cluster predictions of model
plt.subplot(2,2,1)
sns.scatterplot(x=X_train['tenure'],y=X_train['MonthlyCharges'],hue=pipd3,alpha=0.5)
plt.title('tenure v Monthly Charges colored by 3 clusters (predictions)')
#tenure v monthlycharges - points coloured by churn
plt.subplot(2,2,2)
sns.scatterplot(x=X_train['tenure'],y=X_train['MonthlyCharges'],hue=y_train,alpha=0.5)
plt.title('tenure v Monthly Charges colored by actual Churn')
#tenure v totalcharges - cluster
plt.subplot(2,2,3)
sns.scatterplot(x=X_train['tenure'],y=X_train['TotalCharges'],hue=pipd3,alpha=0.5)
plt.title('tenure v Total Charges colored by 3 clusters (predictions)')
#tenure v totalcharges - churn
plt.subplot(2,2,4)
sns.scatterplot(x=X_train['tenure'],y=X_train['TotalCharges'],hue=y_train,alpha=0.5)
plt.title('tenure v Total Charges colored by actual Churn')
#BUG FIX: apply tight_layout before saving so the saved figure matches what is shown
plt.tight_layout()
plt.savefig('saved_figs/clusters-3.png')
plt.show()
Scaling the features and applying a dimensionality reducer actually yields a much better result with 3 clusters. We can actually see a pattern — with 3 clusters the model is much better at providing context.
From these plots we can see that there is one set of customers, at the bottom of the graphs, who have low charges no matter how long their tenure. The groups that interest us are the other two clusters, where there appear to be two types: one with low tenure and high charges (could they be plan hoppers?), and a second with long tenure as well as high charges. Without more context on the customers here, I can't give more of an explanation. However, I will visualize a few other charts to see if there is more to the story.
# See how the 3 groups are different with the data
km_3 = X_train.copy()
km_3['Groups'] = pipd3   #cluster label (0/1/2) from the 3-cluster pipeline, assigned positionally
km_3['Churn'] = y_train  #aligned by index (both come from the same train split)
#a glance at the dataset grouped by cluster: per-group feature means, one column per group
GROUPS3 = km_3.groupby('Groups').mean().transpose()
#looking at these first few columns - we see that there is a significant difference in the 3 groups
GROUPS3.columns = ['Group1','Group2','Group3']
GROUPS3.head()
#split the frame into the three cluster groups for per-feature distribution plots
g0 = km_3.loc[km_3['Groups']==0]
g1 = km_3.loc[km_3['Groups']==1]
g2 = km_3.loc[km_3['Groups']==2]
#columns[:-2] skips the appended 'Groups' and 'Churn' columns
#NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14) -- migrate to histplot/displot
for col in km_3.columns[:-2]:
    plt.figure()
    sns.distplot(g0[col],norm_hist=True,bins=10,label='Group1')
    sns.distplot(g1[col],norm_hist=True,bins=10,label='Group2')
    sns.distplot(g2[col],norm_hist=True,bins=10,label='Group3')
    plt.title(f'Distribution of {col} for each group')
    plt.legend()
    plt.show()  #BUG FIX: the original `plt.show` was missing the call parentheses (a no-op)
GROUPS3
#work on a fresh copy of the raw frame for feature engineering
data=df.copy()
#create a feature called engaged, Month-Month users would qualify as disengaged
data.loc[:,'Engaged']=1
data.loc[(data['Contract']=='Month-to-month'),'Engaged']=0
#Young and Not Engaged: non-senior AND on a month-to-month contract
data.loc[:,'YandNotE']=0
data.loc[(data['SeniorCitizen']==0) & (data['Engaged']==0),'YandNotE']=1
#Electronic payment: pays by electronic check AND is disengaged
data.loc[:,'ElectCheck']=0
data.loc[(data['PaymentMethod']=='Electronic check') & (data['Engaged']==0),'ElectCheck']=1
#fiberoptic internet customer flag
data.loc[:,'fiberopt']=1
data.loc[(data['InternetService']!='Fiber optic'),'fiberopt']=0
#Users that use TV but not internet
#NOTE(review): this flag ends up 1 only when StreamingTV == 'No internet service',
#i.e. it actually marks customers with NO internet at all, which contradicts the
#comment above -- confirm the intended semantics
data.loc[:,'StreamNoInt']=1
data.loc[(data['StreamingTV']!='No internet service'),'StreamNoInt']=0
#no online protection or tech support: 1 only when backup, protection AND support are all 'No'
data.loc[:,'NoProt']=1
data.loc[(data['OnlineBackup']!='No') | (data['DeviceProtection']!='No') | (data['TechSupport']!='No'),'NoProt']=0
#User of all services
#NOTE(review): InternetService takes values such as 'DSL'/'Fiber optic'/'No', never 'Yes',
#so it can never contribute to this count -- verify that is intended
data['TotalServices'] = (data[['PhoneService', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']]== 'Yes').sum(axis=1)
data.head() #can see the 7 new features created that will help eliminate waste
#binning tenure into 3 equal-width intervals (tenure becomes a categorical column)
data['tenure'] = pd.cut(data['tenure'], 3)
#drop columns now judged redundant with the engineered flags above
data = data.drop(columns = [
'Contract',
'DeviceProtection',
'Partner'
])
#encode data and scale
#customer id col
Id_col = ['customerID']
#Target columns
target_col = ["Churn"]
#categorical columns: anything with fewer than 10 distinct values (excluding the target)
cat_cols = data.nunique()[data.nunique() < 10].keys().tolist()
cat_cols = [x for x in cat_cols if x not in target_col]
#numerical columns: whatever remains after removing categoricals, target and id
num_cols = [x for x in data.columns if x not in cat_cols + target_col + Id_col]
#Binary columns with 2 values
bin_cols = data.nunique()[data.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]
#Label encoding Binary columns (maps each to 0/1; safe because each has exactly two levels)
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])
#Duplicating columns for multi value columns (one-hot encoding)
data = pd.get_dummies(data = data,columns = multi_cols )
#Scaling Numerical columns to zero mean / unit variance
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)
#dropping original values merging scaled values for numerical columns
#NOTE(review): the merge below joins on the index; this relies on `data` still
#carrying its default RangeIndex (no rows dropped upstream) -- confirm
df_data_og = data.copy()  #keep an unscaled copy for reference
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index=True,right_index=True,how = "left")
data = data.drop(['customerID'],axis = 1)
data.head() #left with a sparse matrix with 4 numeric columns, 2 of which were engineered during EDA
#create a correlation plot using pyplot for interactivity
def correlation_plot():
    """Render an interactive Plotly heatmap of the correlation matrix of `data`.

    Reads the module-level DataFrame ``data``; displays the figure inline via
    plotly offline mode (no return value).
    """
    #correlation
    correlation = data.corr()
    #tick labels
    matrix_cols = correlation.columns.tolist()
    #convert to array
    corr_array = np.array(correlation)
    trace = go.Heatmap(z = corr_array,
                       x = matrix_cols,
                       y = matrix_cols,
                       colorscale='Viridis',
                       colorbar = dict() ,
                       )
    layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                            margin = dict(r = 0 ,l = 210,
                                          t = 25,b = 210,
                                          ),
                            yaxis = dict(tickfont = dict(size = 9)),
                            xaxis = dict(tickfont = dict(size = 9)),
                            )
                       )
    fig = go.Figure(data = [trace],layout = layout)
    py.iplot(fig)
correlation_plot()
#remove features with co-linearity to prevent overfitting
#Threshold for removing correlated variables
threshold = 0.9
# Absolute value correlation matrix
corr_matrix = data.corr().abs()
corr_matrix.head()
# Upper triangle of correlations (k=1 excludes the diagonal)
# BUG FIX: np.bool was removed in NumPy 1.24 -- the builtin bool is the replacement
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper.head()
# Select columns with any correlation above the threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print('There are %d columns to remove :' % (len(to_drop)))
data = data.drop(columns = to_drop)
to_drop
correlation_plot()
# Def X and Y
y = np.array(data.Churn.tolist())
# BUG FIX: positional-axis drop('Churn', 1) was removed in pandas 2.0 and
# DataFrame.as_matrix() was removed in pandas 1.0 -- use the modern equivalents
data = data.drop(columns='Churn')
X = data.to_numpy()
# Train_test split (fixed seed for reproducibility)
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
#Def model_performance to create clean looking report after training,testing
def model_performance(model) :
    """Render an interactive Plotly report for a fitted classifier.

    Shows the confusion matrix, headline metrics, ROC and precision-recall
    curves, a cumulative gains curve and feature importances.

    Parameters
    ----------
    model : str or estimator
        Either the variable name (str) of a fitted model in the calling
        scope -- the legacy API used elsewhere in this notebook -- or the
        fitted estimator itself.  Must expose ``feature_importances_``.

    Relies on the globals ``y_test``, ``y_pred``, ``y_score`` and ``data``
    being defined before the call.  Displays the figure; returns nothing.
    """
    #Resolve the estimator once.  eval() is kept only for backward
    #compatibility with the existing string-name calls -- never pass
    #untrusted input here.
    estimator = eval(model) if isinstance(model, str) else model
    #Conf matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    trace1 = go.Heatmap(z = conf_matrix ,x = ["0 (pred)","1 (pred)"],
                        y = ["0 (true)","1 (true)"],xgap = 2, ygap = 2,
                        colorscale = 'Viridis', showscale = False)
    #Show metrics
    tp = conf_matrix[1,1]
    fn = conf_matrix[1,0]
    fp = conf_matrix[0,1]
    tn = conf_matrix[0,0]
    Accuracy = ((tp+tn)/(tp+tn+fp+fn))
    Precision = (tp/(tp+fp))
    Recall = (tp/(tp+fn))
    #F1 is the harmonic mean of precision and recall
    F1_score = (2 * Precision * Recall / (Precision + Recall))
    show_metrics = pd.DataFrame(data=[[Accuracy , Precision, Recall, F1_score]])
    show_metrics = show_metrics.T
    colors = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue']
    trace2 = go.Bar(x = (show_metrics[0].values),
                    y = ['Accuracy', 'Precision', 'Recall', 'F1_score'],
                    #np.round_ was removed in NumPy 2.0; np.round is identical
                    text = np.round(show_metrics[0].values,4),
                    textposition = 'auto', textfont=dict(color='black'),
                    orientation = 'h', opacity = 1, marker=dict(
                    color=colors,
                    line=dict(color='#000000',width=1.5)))
    #Roc curve
    model_roc_auc = round(roc_auc_score(y_test, y_score) , 3)
    fpr, tpr, t = roc_curve(y_test, y_score)
    trace3 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2), fill='tozeroy')
    #diagonal chance line for reference
    trace4 = go.Scatter(x = [0,1],y = [0,1],
                        line = dict(color = ('black'),width = 1.5,
                        dash = 'dot'))
    # Precision-recall curve
    precision, recall, thresholds = precision_recall_curve(y_test, y_score)
    trace5 = go.Scatter(x = recall, y = precision,
                        name = "Precision" + str(precision),
                        line = dict(color = ('lightcoral'),width = 2), fill='tozeroy')
    #Feature importance (uses the resolved estimator rather than re-eval'ing)
    coefficients = pd.DataFrame(estimator.feature_importances_)
    column_data = pd.DataFrame(list(data))
    coef_sumry = (pd.merge(coefficients,column_data,left_index= True,
                           right_index= True, how = "left"))
    coef_sumry.columns = ["coefficients","features"]
    coef_sumry = coef_sumry.sort_values(by = "coefficients",ascending = False)
    coef_sumry = coef_sumry[coef_sumry["coefficients"] !=0]
    trace6 = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                    name = "coefficients",
                    marker = dict(color = coef_sumry["coefficients"],
                                  colorscale = "Viridis",
                                  line = dict(width = .6,color = "black")))
    #Cumulative gain
    #BUG FIX: as_matrix() was removed in pandas 1.0 -- to_numpy() is the replacement
    pos = pd.get_dummies(y_test).to_numpy()
    pos = pos[:,1]
    npos = np.sum(pos)
    #rank test rows from highest to lowest predicted score
    index = np.argsort(y_score)
    index = index[::-1]
    sort_pos = pos[index]
    #cumulative sum of positives captured as we move down the ranking
    cpos = np.cumsum(sort_pos)
    #recall at each cut-off depth
    recall = cpos/npos
    #size obs test
    n = y_test.shape[0]
    #BUG FIX: the stop value was hard-coded to 369; derive it from the test-set
    #size so the x and y vectors always have matching lengths
    size = np.arange(start=1,stop=n+1,step=1)
    #proportion of the test set contacted
    size = size / n
    #plots
    trace7 = go.Scatter(x = size,y = recall,
                        line = dict(color = ('gold'),width = 2), fill='tozeroy')
    #Subplots
    fig = tls.make_subplots(rows=4, cols=2, print_grid=False,
                            specs=[[{}, {}],
                                   [{}, {}],
                                   [{'colspan': 2}, None],
                                   [{'colspan': 2}, None]],
                            subplot_titles=('Confusion Matrix',
                                            'Metrics',
                                            'ROC curve'+" "+ '('+ str(model_roc_auc)+')',
                                            'Precision - Recall curve',
                                            'Cumulative gains curve',
                                            'Feature importance'
                                            ))
    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,1,2)
    fig.append_trace(trace3,2,1)
    fig.append_trace(trace4,2,1)
    fig.append_trace(trace5,2,2)
    fig.append_trace(trace6,4,1)
    fig.append_trace(trace7,3,1)
    fig['layout'].update(showlegend = False, title = '<b>Model performance report</b><br>'+str(model),
                         autosize = False, height = 1500,width = 830,
                         plot_bgcolor = 'black',
                         paper_bgcolor = 'black',
                         margin = dict(b = 195), font=dict(color='white'))
    fig["layout"]["xaxis1"].update(color = 'white')
    fig["layout"]["yaxis1"].update(color = 'white')
    fig["layout"]["xaxis2"].update((dict(range=[0, 1], color = 'white')))
    fig["layout"]["yaxis2"].update(color = 'white')
    fig["layout"]["xaxis3"].update(dict(title = "false positive rate"), color = 'white')
    fig["layout"]["yaxis3"].update(dict(title = "true positive rate"),color = 'white')
    fig["layout"]["xaxis4"].update(dict(title = "recall"), range = [0,1.05],color = 'white')
    fig["layout"]["yaxis4"].update(dict(title = "precision"), range = [0,1.05],color = 'white')
    fig["layout"]["xaxis5"].update(dict(title = "Percentage contacted"),color = 'white')
    fig["layout"]["yaxis5"].update(dict(title = "Percentage positive targeted"),color = 'white')
    fig["layout"]["xaxis6"].update(color = 'white')
    fig["layout"]["yaxis6"].update(color = 'white')
    #style subplot titles
    #BUG FIX: the original chained assignment `i['font'] = titlefont=dict(...)`
    #created a stray `titlefont` variable; a plain dict assignment is intended
    for i in fig['layout']['annotations']:
        i['font'] = dict(color='white', size = 14)
    py.iplot(fig)
# Cross val metric
def cross_val_metrics(model) :
    """Print 5-fold cross-validated mean and std for several scoring metrics.

    Parameters
    ----------
    model : estimator
        Unfitted (or fitted) scikit-learn compatible classifier.

    Uses the module-level globals ``X`` and ``y``.
    """
    #BUG FIX: the original rebound the name `scores` (the list being iterated)
    #inside the loop; it happened to work, but a distinct name is used here
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    for sc in metrics:
        scores = cross_val_score(model, X, y, cv = 5, scoring = sc)
        print('[%s] : %0.5f (+/- %0.5f)'%(sc, scores.mean(), scores.std()))
#I will create a LGBM classifier to predict churn
lgbm_clf = lgbm.LGBMClassifier(n_estimators=1500, random_state = 42)
#fit once -- the original called fit twice back to back, which was redundant
lgbm_clf.fit(X_train, y_train)
#hard class predictions and positive-class probabilities for the report below
y_pred = lgbm_clf.predict(X_test)
y_score = lgbm_clf.predict_proba(X_test)[:,1]
model_performance('lgbm_clf')
The above model scored an accuracy of 77% — we should aim for above 80% with a supervised learner.
To maximize the score, tuning the model's hyperparameters is key. Below I have created a RandomizedSearchCV that samples parameter combinations, scores each with cross-validation, keeps track of the scores, and retains the best-scoring combination found.
#keyword arguments forwarded to LGBMClassifier.fit during the search
#NOTE(review): early_stopping_rounds/verbose as fit kwargs were removed in
#lightgbm 4.x (callbacks are the replacement) -- confirm the installed version
fit_params = {"early_stopping_rounds" : 50,
"eval_metric" : 'binary',
"eval_set" : [(X_test,y_test)],
'eval_names': ['valid'],
'verbose': 0,
'categorical_feature': 'auto'}
#search space: explicit grids for learning rate / tree count, plus scipy
#distributions (sp_randint / sp_uniform) for sampled leaf, subsample and
#regularisation parameters
param_test = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000, 3000, 5000],
'num_leaves': sp_randint(6, 50),
'min_child_samples': sp_randint(100, 500),
'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
'subsample': sp_uniform(loc=0.2, scale=0.8),
'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
#number of parameter combinations to sample
n_iter = 200
#initialize lgbm and launch the search (5-fold CV, refit on the best params)
lgbm_clf = lgbm.LGBMClassifier(random_state=random_state, silent=True, metric='None', n_jobs=4)
grid_search = RandomizedSearchCV(
estimator=lgbm_clf, param_distributions=param_test,
n_iter=n_iter,
scoring='accuracy',
cv=5,
refit=True,
random_state=random_state,
verbose=10,n_jobs=2)
grid_search.fit(X_train, y_train, **fit_params)
print('Best params: {} '.format(grid_search.best_params_))
opt_parameters = grid_search.best_params_
#completed in 4.7 minutes - LGBM is a FAST computation model.
#Rebuild my LGBM with the best parameters from the randomsearch
lgbm_clf = lgbm.LGBMClassifier(**opt_parameters)
#fit once -- the original called fit twice back to back, which was redundant
lgbm_clf.fit(X_train, y_train)
#refresh predictions/probabilities so the report reflects the tuned model
y_pred = lgbm_clf.predict(X_test)
y_score = lgbm_clf.predict_proba(X_test)[:,1]
model_performance('lgbm_clf')
The best parameters yield a significant increase in precision, recall and F1.
Precision is how precise the model is (how often a predicted positive is actually positive). Recall is how often the model catches the actual positives (the 1s) in the data — this is harder to come by with an imbalanced target like we have here. The F1 score is the harmonic mean of the two and is often reported alongside precision and recall.
#Finally - cross-validate my tuned model and take the average of the fold scores as its final performance
cross_val_metrics(lgbm_clf)